Documentation for the analysis of mouse placental RNA-seq data at e7.5, e8.5 and e9.5.
Load essential files + functions + libraries
tpm2 <- read.table("/work/LAS/geetu-lab/hhvu/project1_2/rna-seq/PlacentaRNA-seq/Files/tpmForClustering.txt", header = T)
summ <- function(trans_cluster, tpm2=tpm2) {
tpm3 <- tpm2
tpm3 <- cbind(rownames(tpm2), tpm3)
rownames(tpm3) <- 1:nrow(tpm3)
colnames(tpm3)[1] <- "transcripts"
#e7.5
e7.5_table <- data.frame(tpm3$transcripts, rowMeans(tpm3[,c("E7.5_2", "E7.5_3", "E7.5_4", "E7.5_5", "E7.5_6")]))
e7.5_table$time <- c("e7.5")
rownames(e7.5_table) <- 1:nrow(e7.5_table)
colnames(e7.5_table) <- c("transcripts", "mean_cts_scaled", "time")
#e8.5
e8.5_table <- data.frame(tpm3$transcripts, rowMeans(tpm3[,c("E8.5_1", "E8.5_2", "E8.5_3", "E8.5_4", "E8.5_5", "E8.5_6")]))
e8.5_table$time <- c("e8.5")
rownames(e8.5_table) <- 1:nrow(e8.5_table)
colnames(e8.5_table) <- c("transcripts", "mean_cts_scaled", "time")
#e9.5
e9.5_table <- data.frame(tpm3$transcripts, rowMeans(tpm3[,c("E9.5_1", "E9.5_2", "E9.5_3", "E9.5_4", "E9.5_5")]))
e9.5_table$time <- c("e9.5")
rownames(e9.5_table) <- 1:nrow(e9.5_table)
colnames(e9.5_table) <- c("transcripts", "mean_cts_scaled", "time")
summary <- rbind(rbind(e7.5_table, e8.5_table), e9.5_table)
summary <- summary[order(summary$transcripts),]
summary <- inner_join(summary, trans_cluster, by = c("transcripts" = "name"))
summary <- inner_join(summary, t2g, by = c("transcripts" = "target_id"))
return(summary)
}
plotClus <- function(summary, title){
ascl2 <- subset(summary, summary$ext_gene == "Ascl2") #e7.5
gjb5 <- subset(summary, summary$ext_gene == "Gjb5") #e7.5
dnmt1 <- subset(summary, summary$ext_gene == "Dnmt1") #e8.5
itga4 <- subset(summary, summary$ext_gene == "Itga4") #e8.5
gjb2 <- subset(summary, summary$ext_gene == "Gjb2") #e9.5
igf2 <- subset(summary, summary$ext_gene == "Igf2") #e9.5
p <- ggplot(aes(time, mean_cts_scaled), data = summary) +
geom_line(aes(group = transcripts), alpha = 0.5, colour = "grey77") +
geom_line(stat = "summary", fun = "median", size = 2,
aes(group = 1, color = "Group median")) +
labs(title = title,
x = "Time point",
y = "Scaled mean transcript counts", color = "Legend", linetype = "Legend") +
theme(plot.title = element_text(size = 15, face = "bold"), legend.text=element_text(size=20)) +
geom_line(data = ascl2, size = 2.5, aes(group = transcripts, color = "Ascl2", linetype = "Ascl2"), alpha = 1) + #e7.5
geom_line(data = gjb5, size = 2, aes(group = transcripts, color = "Gjb5", linetype = "Gjb5" ), alpha = 1) + #e7.5
geom_line(data = dnmt1, size = 2, aes(group = transcripts, color = "Dnmt1", linetype = "Dnmt1"), alpha = 1) + #e8.5
geom_line(data = itga4, size = 2, aes(group = transcripts, color = "Itga4", linetype = "Itga4"), alpha = 1) + #e8.5
geom_line(data = gjb2, size = 2, aes(group = transcripts, color = "Gjb2", linetype = "Gjb2"), alpha = 1) + #e9.5
geom_line(data = igf2, size = 2, aes(group = transcripts, color = "Igf2", linetype = "Igf2"), alpha = 1) + #e9.5
scale_color_manual(name = "Legend", values = c("Ascl2" = "darkolivegreen4", "Gjb5" = "yellow3",
"Dnmt1" = "dodgerblue4", "Itga4" = "deepskyblue4",
"Gjb2" = "saddlebrown", "Igf2" = "salmon3",
"Group median" = "grey22")) +
scale_linetype_manual(name = "Legend", values = c("Ascl2" = "solid", "Gjb5" = "solid",
"Dnmt1" = "twodash", "Itga4" = "solid",
"Gjb2" = "solid", "Igf2" = "solid",
"Group median" = "solid")) +
guides(linetype=F,
colour=guide_legend(keywidth = 3, keyheight = 1)) +
theme(text = element_text(size=15),
legend.text=element_text(size=15),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text.x = element_text(angle=0, hjust=0.5, size = 25),
axis.text.y = element_text(size = 15)) +
facet_grid(cols = vars(value))
return(p)
}
library("ggplot2", suppressMessages())
library("dplyr", suppressMessages())
library("tidyverse", suppressMessages())
t2g <- read.table("/work/LAS/geetu-lab/hhvu/project1_2/rna-seq/PlacentaRNA-seq/Files/t2g.txt", header = T, sep = "\t")
t2g <- t2g[order(t2g$target_id),]
coding <- read.table("/work/LAS/geetu-lab/hhvu/project1_2/rna-seq/PlacentaRNA-seq/Files/Mus_musculus_grcm38_coding_transcripts.txt", header = F)
hc <- hclust(dist(tpm2, method = "euclidean"), "complete")
dend <- as.dendrogram(hc)
library("clusterProfiler")
library("org.Mm.eg.db")
go <- function(subnetworkGenes) {
GO <- enrichGO(gene = subnetworkGenes, OrgDb=org.Mm.eg.db, ont = "BP", qvalueCutoff = 0.05, maxGSSize=1000, readable = T, keyType = "ENSEMBL")
goSimplify <- GO
simplify2 <- as.data.frame(goSimplify)
simplify2$GeneRatio <- sapply(strsplit(simplify2$GeneRatio, "/"), function(x) as.numeric(x[1])/as.numeric(x[2]))
simplify2$BgRatio <- sapply(strsplit(simplify2$BgRatio, "/"), function(x) as.numeric(x[1])/as.numeric(x[2]))
simplify2$Fold <- as.numeric(simplify2$GeneRatio)/as.numeric(simplify2$BgRatio)
simplify2 <- simplify2[simplify2$qvalue <= 0.05 & simplify2$Fold >= 2 & simplify2$Count >= 5,]
return(simplify2)
}
trans_cluster <- cutree(hc, k = 3) %>% enframe()
table(trans_cluster$value)
#>
#> 1 2 3
#> 8091 7238 8242
summK3 <- summ(trans_cluster, tpm2)
p <- plotClus(summK3, "Hierarchical Clustering of Transcripts, k = 3")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
g3 <- unique(summK3[summK3$value == "3", "ens_gene"])
length(g3)
#> [1] 5566
g3go <- go(g3)
trans_cluster <- cutree(hc, k = 4) %>% enframe()
table(trans_cluster$value)
#>
#> 1 2 3 4
#> 8091 7238 4501 3741
summK4 <- summ(trans_cluster, tpm2)
p <- plotClus(summK4, "Hierarchical Clustering of Transcripts, k = 4")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
g4 <- unique(summK4[summK4$value == "4", "ens_gene"])
length(g4)
#> [1] 3082
g4go <- go(g4)
#g4go$Description
trans_cluster <- cutree(hc, k = 5) %>% enframe()
table(trans_cluster$value)
#>
#> 1 2 3 4 5
#> 8091 7238 4501 2532 1209
summK5 <- summ(trans_cluster, tpm2)
p <- plotClus(summK5, "Hierarchical Clustering of Transcripts, k = 5")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
g4 <- unique(summK5[summK5$value == "4", "ens_gene"])
length(g4)
#> [1] 2184
g4go <- go(g4)
#g4go$Description
g5 <- unique(summK5[summK5$value == "5", "ens_gene"])
length(g5)
#> [1] 1105
g5go <- go(g5)
#g5go$Description
trans_cluster <- cutree(hc, k = 6) %>% enframe()
table(trans_cluster$value)
#>
#> 1 2 3 4 5 6
#> 4833 3258 7238 4501 2532 1209
summK6 <- summ(trans_cluster, tpm2)
p <- plotClus(summK6, "Hierarchical Clustering of Transcripts, k = 6")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
trans_cluster <- cutree(hc, k = 7) %>% enframe()
table(trans_cluster$value)
#>
#> 1 2 3 4 5 6 7
#> 4833 3258 7238 4501 1349 1209 1183
summK7 <- summ(trans_cluster, tpm2)
p <- plotClus(summK7, "Hierarchical Clustering of Transcripts, k = 7")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
trans_cluster <- cutree(hc, k = 8) %>% enframe()
table(trans_cluster$value)
#>
#> 1 2 3 4 5 6 7 8
#> 4833 3258 7238 2429 1349 1209 1183 2072
summK8 <- summ(trans_cluster, tpm2)
p <- plotClus(summK8, "Hierarchical Clustering of Transcripts, k = 8")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
trans_cluster <- cutree(hc, k = 9) %>% enframe()
table(trans_cluster$value)
#>
#> 1 2 3 4 5 6 7 8 9
#> 2838 3258 7238 2429 1349 1995 1209 1183 2072
summK9 <- summ(trans_cluster, tpm2)
p <- plotClus(summK9, "Hierarchical Clustering of Transcripts, k = 9")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
trans_cluster <- cutree(hc, k = 10) %>% enframe()
table(trans_cluster$value)
#>
#> 1 2 3 4 5 6 7 8 9 10
#> 2838 1173 7238 2429 1349 2085 1995 1209 1183 2072
summK10 <- summ(trans_cluster, tpm2)
p <- plotClus(summK10, "Hierarchical Clustering of Transcripts, k = 10")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
K = 3 hierarchical clustering
trans_cluster <- cutree(hc, k = 3) %>% enframe()
table(trans_cluster$value)
#>
#> 1 2 3
#> 8091 7238 8242
summK3 <- summ(trans_cluster, tpm2)
p <- plotClus(summK3, "Hierarchical Clustering of Transcripts, k = 3")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
summK3$value <- ifelse(summK3$value == "1", "e8.5",
ifelse(summK3$value == "2", "e9.5",
ifelse(summK3$value == "3", "e7.5", summK3$value)))
summK3$value <- ifelse(summK3$value == "e8.5", "2",
ifelse(summK3$value == "e9.5", "3",
ifelse(summK3$value == "e7.5", "1", summK3$value)))
hc1 <- summK3[summK3$value == 1,]
hc2 <- summK3[summK3$value == 2,]
hc3 <- summK3[summK3$value == 3,]
K = 3 K-means
set.seed(123)
km <- kmeans(tpm2, centers = 3)
trans_cluster_kmeans <- km$cluster %>% enframe()
summKmeans <- summ(trans_cluster_kmeans, tpm2)
p <- plotClus(summKmeans, "K-means Clustering of Transcripts")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
summKmeans$value <- ifelse(summKmeans$value == "1", "e9.5",
ifelse(summKmeans$value == "2", "e7.5",
ifelse(summKmeans$value == "3", "e8.5", summKmeans$value)))
summKmeans$value <- ifelse(summKmeans$value == "e8.5", "2",
ifelse(summKmeans$value == "e9.5", "3",
ifelse(summKmeans$value == "e7.5", "1", summKmeans$value)))
summKmeans_hc1 <- summKmeans[summKmeans$transcripts %in% hc1$transcripts,]
summKmeans_hc2 <- summKmeans[summKmeans$transcripts %in% hc2$transcripts,]
summKmeans_hc3 <- summKmeans[summKmeans$transcripts %in% hc3$transcripts,]
library("fossil")
#> Warning: package 'maps' was built under R version 4.0.3
rand.index(as.numeric(summKmeans_hc1$value), as.numeric(hc1$value))
#> [1] 0.616173
rand.index(as.numeric(summKmeans_hc2$value), as.numeric(hc2$value))
#> [1] 0.6144555
rand.index(as.numeric(summKmeans_hc3$value), as.numeric(hc3$value))
#> [1] 0.7729758
SOM
trans_cluster_som <- read.table("Files/trans_cluster_som.txt", header = T)
summSOM <- summ(trans_cluster_som, tpm2)
p <- plotClus(summSOM, "Self-organizing Maps of Transcripts")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
summSOM$value <- ifelse(summSOM$value == "1", "e8.5",
ifelse(summSOM$value == "2", "e9.5",
ifelse(summSOM$value == "3", "e7.5", summSOM$value)))
summSOM$value <- ifelse(summSOM$value == "e8.5", "2",
ifelse(summSOM$value == "e9.5", "3",
ifelse(summSOM$value == "e7.5", "1", summSOM$value)))
summSOM_hc1 <- summSOM[summSOM$transcripts %in% hc1$transcripts,]
summSOM_hc2 <- summSOM[summSOM$transcripts %in% hc2$transcripts,]
summSOM_hc3 <- summSOM[summSOM$transcripts %in% hc3$transcripts,]
rand.index(as.numeric(summSOM_hc1$value), as.numeric(hc1$value))
#> [1] 0.6110996
rand.index(as.numeric(summSOM_hc2$value), as.numeric(hc2$value))
#> [1] 0.6192385
rand.index(as.numeric(summSOM_hc3$value), as.numeric(hc3$value))
#> [1] 0.7752637
Spectral clustering
trans_cluster_sc <- read.table("Files/trans_cluster_sc.txt", header = T)
summSC <- summ(trans_cluster_sc, tpm2)
p <- plotClus(summSC, "Spectral Clustering of Transcripts")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p
summSC$value <- ifelse(summSC$value == "1", "e7.5",
ifelse(summSC$value == "2", "e8.5",
ifelse(summSC$value == "3", "e9.5", summSC$value)))
summSC$value <- ifelse(summSC$value == "e8.5", "2",
ifelse(summSC$value == "e9.5", "3",
ifelse(summSC$value == "e7.5", "1", summSC$value)))
summSC_hc1 <- summSC[summSC$transcripts %in% hc1$transcripts,]
summSC_hc2 <- summSC[summSC$transcripts %in% hc2$transcripts,]
summSC_hc3 <- summSC[summSC$transcripts %in% hc3$transcripts,]
rand.index(as.numeric(summSC_hc1$value), as.numeric(hc1$value))
#> [1] 0.6183286
rand.index(as.numeric(summSC_hc2$value), as.numeric(hc2$value))
#> [1] 0.4985626
rand.index(as.numeric(summSC_hc3$value), as.numeric(hc3$value))
#> [1] 0.8725749